Exploratory Data Analysis (EDA)

This R Markdown document covers the Exploratory Data Analysis (EDA) of the training and validation datasets of the PRD.

Setting-up

## Loading the needed libraries

library(kableExtra)      # helps build complex tables and manipulate table styles
library(tidyverse)       # for general data wrangling (includes readr and dplyr)
library(ggplot2)         # to draw statistical plots (already attached by tidyverse)
library(plotly)          # to construct interactive 3D plots
library(DataExplorer)    # automated data exploration
library(corrplot)        # to plot a nice correlation matrix
library(caret)           # includes several pre-processing functions
library(scales)          # for determining breaks and labels for axes and legends
library(skimr)
library(funModeling) 
library(Hmisc)
library(grid)
library(hrbrthemes)
library(tidyr)           # already attached by tidyverse
library(viridis)
library(ggpubr)
library(ggthemes)
library(GGally)
library(nortest)
## Loading the training and validation datasets

# Restore the objects saved in the milestone-2 RData file into the workspace.
# NOTE(review): presumably this is where df_train_f and df_validate_f come
# from -- confirm against the file contents.
load(
  file = "~/GitHub/ff-beta-release-matching/poc/EDA/data_milestone2_df_train_validate_20191025.RData"
)

Training

## View train dataframe 

# Show the first rows of the training data as a styled table wrapped in a
# full-width scroll box (the data frame has many columns).
df_train_f %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
client_id num_active_days content_crashes active_hours uri_count session_length search_count num_bookmarks num_pages daily_unique_domains daily_max_tabs daily_tabs_opened startup_ms daily_num_sessions_started active_hours_max uri_count_max session_length_max search_count_max num_pages_max daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max label install_year profile_age fxa_configured sync_configured is_default_browser locale normalized_channel app_version default_search_engine country timezone_offset num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb cpu_vendor memory_mb os_version is_wow64 FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat distro_id_norm timezone_cat memory_cat cpu_speed_cat cpu_cores_cat is_release cpu_l2_cache_kb_cat
001cf926-92e3-4587-887e-d3156ba24d82 8 0 1.4215278 76.1250 22.9337499 1.875 11.00 4014.5 6.062500 7.00000 15.00 54176.2500 0.6250000 3.3208333 139 37.007500 6 15464 17.000000 11 29 180095 2 beta 2016 1160 False False True en-US beta 67 DuckDuckGo US -240 8 2 2527 256 Intel 4022 6.1 False 4223.089 5220.036 9079.136 5221.752 6157.840 5198.1300 < 5 years Mozilla (-6,-4] < 4GB < 3GHz 2 FALSE < 256
00210163-2123-427e-bb73-398bda9f9eba 5 0 0.8305556 168.2000 2.4390556 0.800 248.75 20599.5 7.866667 3.40000 14.80 3164.0667 1.2000000 1.6708333 325 8.318333 3 20719 17.000000 4 33 4966 2 beta 2016 1079 False False False en-US beta 67 DuckDuckGo GB 60 6 2 2394 256 Intel 3810 6.1 False 2148.350 2253.526 1159.979 2146.827 1155.050 1015.9784 < 5 years Mozilla (0,2] < 4GB < 3GHz 2 FALSE < 256
0024fd24-4ef5-4771-850a-9e3846597015 2 0 0.5111111 82.0000 0.8712505 2.500 9.00 87.0 2.166667 4.00000 8.00 23977.9444 5.0000000 0.8250000 145 1.464445 5 87 3.333333 6 15 31918 9 beta 2019 745 False False True en-US beta 67 Google GB 60 6 4 2394 256 Intel 8124 10.0 False 2699.834 2216.994 1832.642 2119.277 1819.612 1979.7910 < 5 years Mozilla (0,2] < 16GB < 3GHz < 4 FALSE < 256
004f70f7-2576-4de5-94b4-5bf1acdca0a8 8 0 0.3946181 101.8750 4.6785415 7.750 87.00 8882.0 7.750000 9.25000 19.50 1703.3125 1.3750000 1.1625000 210 9.174722 10 9044 13.000000 12 37 3454 2 beta 2018 130 True True True en-US beta 67 Google US -240 10 4 3991 256 Intel 16235 10.0 False 2370.563 2368.195 1614.073 2356.307 1497.652 850.0551 < 6 months Mozilla (-6,-4] < 16GB < 4GHz < 4 FALSE < 256
007c0c11-38e4-476b-a494-d732e15ac159 4 0 0.5930556 167.5000 5.6081245 0.250 13.00 3024.0 1.775000 8.75000 13.75 15285.5375 2.2500000 0.8791667 255 9.059722 1 3552 2.500000 27 36 24592 3 beta 2018 293 False False False en-US beta 67 Google US 360 7 2 2659 3072 Intel 3317 10.0 False 4050.417 5106.575 3630.348 5041.016 3618.008 2006.3376 < 2 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE > 1024
0294837f-c98f-44ab-8237-30d2eba6c55a 6 0 1.6333333 165.8333 32.4127778 5.500 17.00 7740.7 8.583333 10.33333 20.00 975.2222 0.1666667 3.2319444 323 41.553611 12 7975 13.000000 18 35 1088 1 beta 2019 502 False False True en-US beta 67 other (non-bundled) GB 60 7 4 1800 256 Intel 8026 10.0 False 2740.793 2170.362 1482.620 2033.104 1161.901 1034.0580 < 2 years Mozilla (0,2] < 16GB < 2GHz < 4 FALSE < 256

Validation

## View validation dataframe

# Show the first rows of the validation data as a styled table wrapped in a
# full-width scroll box (the data frame has many columns).
df_validate_f %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
client_id num_active_days content_crashes active_hours uri_count session_length search_count num_bookmarks num_pages daily_unique_domains daily_max_tabs daily_tabs_opened startup_ms daily_num_sessions_started active_hours_max uri_count_max session_length_max search_count_max num_pages_max daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max startup_ms_max daily_num_sessions_started_max label install_year profile_age fxa_configured sync_configured is_default_browser locale normalized_channel app_version default_search_engine country timezone_offset num_addons cpu_cores cpu_speed_mhz cpu_l2_cache_kb cpu_vendor memory_mb os_version is_wow64 FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS profile_age_cat distro_id_norm timezone_cat memory_cat cpu_speed_cat cpu_cores_cat is_release cpu_l2_cache_kb_cat
001cf926-92e3-4587-887e-d3156ba24d82 8 0 1.5369792 71.12500 23.6797916 3.375000 11 1890.571 9.216667 8.500000 16.625 9928.483 0.6250000 2.3250000 120 34.020000 6 2094 20 14 31 17491.667 3 beta 2016 1204 False False True en-US beta 68 DuckDuckGo US -240 7 2 2527 256 Intel 4022 6.1 False 3133.947 3713.308 3748.471 3730.944 2444.799 1972.6632 < 5 years Mozilla (-6,-4] < 4GB < 3GHz 2 FALSE < 256
00210163-2123-427e-bb73-398bda9f9eba 2 0 0.1833333 43.50000 0.5619445 1.500000 259 22005.000 5.000000 3.000000 6.000 5413.500 1.0000000 0.2152778 48 0.781111 2 22005 7 4 7 9579.000 1 beta 2016 1124 False False False en-US beta 68 DuckDuckGo GB 60 5 2 2394 256 Intel 3810 6.1 False 3226.048 2561.596 1346.836 2523.810 1385.350 935.6222 < 5 years Mozilla (0,2] < 4GB < 3GHz 2 FALSE < 256
007c0c11-38e4-476b-a494-d732e15ac159 2 0 0.2423611 89.00000 6.6299995 0.000000 15 7203.000 1.000000 6.500000 9.500 7041.667 1.0000000 0.3222222 99 9.959722 0 7203 1 11 12 9194.333 2 beta 2018 336 False False False en-US beta 68 Google US 360 6 2 2659 3072 Intel 3317 10.0 False 4400.155 7244.930 3711.445 7280.457 3929.289 1474.5607 < 2 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE > 1024
009ca4e9-874a-4c3e-983d-af0923346efb 3 0 0.1365741 29.66667 0.2358330 0.000000 11 2112.000 1.666667 1.666667 1.000 4644.333 1.6666667 0.1958333 53 0.387222 0 2112 2 2 1 6964.500 2 beta 2013 1606 False False True en-US beta 68 Google GB 360 6 2 2594 256 Intel 3965 6.2 False 5909.683 11043.408 5398.452 10410.288 5860.450 4538.1379 < 5 years Mozilla (4,6] < 4GB < 3GHz 2 FALSE < 256
0101d568-0c63-4492-9295-ed57ef78207f 3 0 0.3435185 15.66667 24.0802773 0.000000 7 17.000 1.000000 1.666667 1.500 4214.833 0.3333333 0.5083333 19 39.200277 0 18 1 2 2 4215.000 1 beta 2017 6 False False False en-US beta 68 Google US -420 5 4 3093 256 Intel 16274 6.3 True 2306.655 3225.400 2761.783 3493.289 2141.391 2823.6923 < 1 week Mozilla (-8,-6] < 16GB < 4GHz < 4 FALSE < 256
0159675e-15b0-4443-85b1-94de65455636 6 0 0.0946759 18.50000 10.6411113 1.166667 7 115.500 2.666667 3.166667 4.000 2427.250 1.1666667 0.1944444 33 29.938334 2 137 5 6 7 5234.000 2 beta 2019 17 False False False en-US beta 68 Google US -240 5 4 3292 256 Intel 8098 10.0 True 5100.212 6103.080 3665.146 6118.852 3707.192 3391.5286 < 1 month Mozilla (-6,-4] < 16GB < 4GHz < 4 FALSE < 256

Step 1 : Data Inspection

Training

To get acquainted with our training dataset, let’s take a look at its basic information.

rows columns discrete_columns continuous_columns all_missing_columns total_missing_values complete_rows total_observations memory_usage
302819 58 20 38 0 0 302819 17563502 135686992

Validation

To get acquainted with our validation dataset, let’s take a look at its basic information.

rows columns discrete_columns continuous_columns all_missing_columns total_missing_values complete_rows total_observations memory_usage
328042 58 20 38 0 0 328042 19026436 146987912

Observations

  • Most of the dataset is composed of continuous variables
  • No NA values (they were handled in preprocessing)

Data Structure

Training

Let’s use the glimpse function to display a vertical preview of the training dataset, so we can easily see each column’s data type and a sample of its values.

# Vertical preview of the training data: one line per column with its type
# and the first few values.
df_train_f %>%
  glimpse()
## Observations: 302,819
## Variables: 58
## $ client_id                         <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days                   <int> 8, 5, 2, 8, 4, 6, 8, 4, 3, 5...
## $ content_crashes                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours                      <dbl> 1.42152778, 0.83055556, 0.51...
## $ uri_count                         <dbl> 76.12500, 168.20000, 82.0000...
## $ session_length                    <dbl> 22.93374988, 2.43905560, 0.8...
## $ search_count                      <dbl> 1.875000, 0.800000, 2.500000...
## $ num_bookmarks                     <dbl> 11.00, 248.75, 9.00, 87.00, ...
## $ num_pages                         <dbl> 4014.5000, 20599.5000, 87.00...
## $ daily_unique_domains              <dbl> 6.062500, 7.866667, 2.166667...
## $ daily_max_tabs                    <dbl> 7.000000, 3.400000, 4.000000...
## $ daily_tabs_opened                 <dbl> 15.000000, 14.800000, 8.0000...
## $ startup_ms                        <dbl> 54176.2500, 3164.0667, 23977...
## $ daily_num_sessions_started        <dbl> 0.6250000, 1.2000000, 5.0000...
## $ active_hours_max                  <dbl> 3.3208333, 1.6708333, 0.8250...
## $ uri_count_max                     <int> 139, 325, 145, 210, 255, 323...
## $ session_length_max                <dbl> 37.007500, 8.318333, 1.46444...
## $ search_count_max                  <int> 6, 3, 5, 10, 1, 12, 29, 0, 0...
## $ num_pages_max                     <dbl> 15464.0, 20719.0, 87.0, 9044...
## $ daily_unique_domains_max          <dbl> 17.000000, 17.000000, 3.3333...
## $ daily_max_tabs_max                <int> 11, 4, 6, 12, 27, 18, 18, 2,...
## $ daily_tabs_opened_max             <int> 29, 33, 15, 37, 36, 35, 170,...
## $ startup_ms_max                    <dbl> 180095.000, 4966.000, 31918....
## $ daily_num_sessions_started_max    <int> 2, 2, 9, 2, 3, 1, 6, 1, 2, 3...
## $ label                             <fct> beta, beta, beta, beta, beta...
## $ install_year                      <dbl> 2016, 2016, 2019, 2018, 2018...
## $ profile_age                       <dbl> 1160, 1079, 745, 130, 293, 5...
## $ fxa_configured                    <fct> False, False, False, True, F...
## $ sync_configured                   <fct> False, False, False, True, F...
## $ is_default_browser                <fct> True, False, True, True, Fal...
## $ locale                            <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel                <fct> beta, beta, beta, beta, beta...
## $ app_version                       <dbl> 67, 67, 67, 67, 67, 67, 67, ...
## $ default_search_engine             <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country                           <fct> US, GB, GB, US, US, GB, GB, ...
## $ timezone_offset                   <int> -240, 60, 60, -240, 360, 60,...
## $ num_addons                        <dbl> 8.00, 6.00, 6.00, 10.00, 7.0...
## $ cpu_cores                         <dbl> 2, 2, 4, 4, 2, 4, 2, 2, 2, 2...
## $ cpu_speed_mhz                     <dbl> 2527, 2394, 2394, 3991, 2659...
## $ cpu_l2_cache_kb                   <dbl> 256, 256, 256, 256, 3072, 25...
## $ cpu_vendor                        <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb                         <int> 4022, 3810, 8124, 16235, 331...
## $ os_version                        <ord> 6.1, 6.1, 10.0, 10.0, 10.0, ...
## $ is_wow64                          <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT          <dbl> 4223.0885, 2148.3495, 2699.8...
## $ TIME_TO_DOM_COMPLETE_MS           <dbl> 5220.036, 2253.526, 2216.994...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 9079.1364, 1159.9791, 1832.6...
## $ TIME_TO_LOAD_EVENT_END_MS         <dbl> 5221.7525, 2146.8265, 2119.2...
## $ TIME_TO_DOM_INTERACTIVE_MS        <dbl> 6157.8399, 1155.0503, 1819.6...
## $ TIME_TO_NON_BLANK_PAINT_MS        <dbl> 5198.1300, 1015.9784, 1979.7...
## $ profile_age_cat                   <ord> < 5 years, < 5 years, < 5 ye...
## $ distro_id_norm                    <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat                      <fct> "(-6,-4]", "(0,2]", "(0,2]",...
## $ memory_cat                        <ord> < 4GB, < 4GB, < 16GB, < 16GB...
## $ cpu_speed_cat                     <ord> < 3GHz, < 3GHz, < 3GHz, < 4G...
## $ cpu_cores_cat                     <ord> 2, 2, < 4, < 4, 2, < 4, 2, 2...
## $ is_release                        <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat               <fct> < 256, < 256, < 256, < 256, ...

If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.

# Per-column profile of the training data (zeros, NAs, infinite values, type,
# unique count), rendered as a styled table inside a full-width scroll box.
# print_results = FALSE stops df_status() from printing the table itself, so
# only the kable rendering is shown.
df_train_f %>%
  df_status(print_results = FALSE) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
client_id 0 0.00 0 0 0 0 character 302805
num_active_days 0 0.00 0 0 0 0 integer 8
content_crashes 302819 100.00 0 0 0 0 integer 1
active_hours 7 0.00 0 0 0 0 numeric 180711
uri_count 0 0.00 0 0 0 0 numeric 18610
session_length 0 0.00 0 0 0 0 numeric 289472
search_count 80328 26.53 0 0 0 0 numeric 930
num_bookmarks 472 0.16 0 0 0 0 numeric 19727
num_pages 7 0.00 0 0 0 0 numeric 179391
daily_unique_domains 0 0.00 0 0 0 0 numeric 66058
daily_max_tabs 0 0.00 0 0 0 0 numeric 2888
daily_tabs_opened 0 0.00 0 0 0 0 numeric 4645
startup_ms 0 0.00 0 0 0 0 numeric 276270
daily_num_sessions_started 3100 1.02 0 0 0 0 numeric 654
active_hours_max 7 0.00 0 0 0 0 numeric 39216
uri_count_max 0 0.00 0 0 0 0 integer 3642
session_length_max 0 0.00 0 0 0 0 numeric 188078
search_count_max 80328 26.53 0 0 0 0 integer 140
num_pages_max 7 0.00 0 0 0 0 numeric 76011
daily_unique_domains_max 0 0.00 0 0 0 0 numeric 1996
daily_max_tabs_max 0 0.00 0 0 0 0 integer 541
daily_tabs_opened_max 0 0.00 0 0 0 0 integer 853
startup_ms_max 0 0.00 0 0 0 0 numeric 145192
daily_num_sessions_started_max 3100 1.02 0 0 0 0 integer 85
label 0 0.00 0 0 0 0 factor 2
install_year 0 0.00 0 0 0 0 numeric 21
profile_age 6320 2.09 0 0 0 0 numeric 4127
fxa_configured 0 0.00 0 0 0 0 factor 2
sync_configured 0 0.00 0 0 0 0 factor 2
is_default_browser 0 0.00 0 0 0 0 factor 2
locale 0 0.00 0 0 0 0 factor 2
normalized_channel 0 0.00 0 0 0 0 factor 2
app_version 0 0.00 0 0 0 0 numeric 1
default_search_engine 0 0.00 0 0 0 0 factor 6
country 0 0.00 0 0 0 0 factor 2
timezone_offset 736 0.24 0 0 0 0 integer 35
num_addons 53 0.02 0 0 0 0 numeric 2124
cpu_cores 0 0.00 0 0 0 0 numeric 27
cpu_speed_mhz 0 0.00 0 0 0 0 numeric 1232
cpu_l2_cache_kb 0 0.00 0 0 0 0 numeric 8
cpu_vendor 0 0.00 0 0 0 0 factor 3
memory_mb 0 0.00 0 0 0 0 integer 5893
os_version 0 0.00 0 0 0 0 ordered-factor 5
is_wow64 0 0.00 0 0 0 0 factor 2
FX_PAGE_LOAD_MS_2_PARENT 0 0.00 0 0 0 0 numeric 294778
TIME_TO_DOM_COMPLETE_MS 0 0.00 0 0 0 0 numeric 300964
TIME_TO_DOM_CONTENT_LOADED_END_MS 0 0.00 0 0 0 0 numeric 300503
TIME_TO_LOAD_EVENT_END_MS 0 0.00 0 0 0 0 numeric 301045
TIME_TO_DOM_INTERACTIVE_MS 0 0.00 0 0 0 0 numeric 300036
TIME_TO_NON_BLANK_PAINT_MS 0 0.00 0 0 0 0 numeric 296802
profile_age_cat 0 0.00 0 0 0 0 ordered-factor 6
distro_id_norm 0 0.00 0 0 0 0 factor 4
timezone_cat 0 0.00 0 0 0 0 factor 13
memory_cat 0 0.00 0 0 0 0 ordered-factor 6
cpu_speed_cat 0 0.00 0 0 0 0 ordered-factor 5
cpu_cores_cat 0 0.00 0 0 0 0 ordered-factor 6
is_release 59627 19.69 0 0 0 0 logical 2
cpu_l2_cache_kb_cat 0 0.00 0 0 0 0 factor 4
  • q_zeros: quantity of zeros (p_zeros: in percent)
  • q_inf: quantity of infinite values (p_inf: in percent)
  • q_na: quantity of NA (p_na: in percent)
  • type: factor, ordered-factor, numeric, integer, logical or character
  • unique: quantity of unique values

Validation

Let’s use the glimpse function to display a vertical preview of the validation dataset, so we can easily see each column’s data type and a sample of its values.

# Vertical preview of the validation data: one line per column with its type
# and the first few values.
df_validate_f %>%
  glimpse()
## Observations: 328,042
## Variables: 58
## $ client_id                         <chr> "001cf926-92e3-4587-887e-d31...
## $ num_active_days                   <int> 8, 2, 2, 3, 3, 6, 1, 4, 7, 4...
## $ content_crashes                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ active_hours                      <dbl> 1.53697917, 0.18333333, 0.24...
## $ uri_count                         <dbl> 71.125000, 43.500000, 89.000...
## $ session_length                    <dbl> 23.6797916, 0.5619445, 6.629...
## $ search_count                      <dbl> 3.3750000, 1.5000000, 0.0000...
## $ num_bookmarks                     <dbl> 11.0, 259.0, 15.0, 11.0, 7.0...
## $ num_pages                         <dbl> 1890.5714, 22005.0000, 7203....
## $ daily_unique_domains              <dbl> 9.216667, 5.000000, 1.000000...
## $ daily_max_tabs                    <dbl> 8.500000, 3.000000, 6.500000...
## $ daily_tabs_opened                 <dbl> 16.625, 6.000, 9.500, 1.000,...
## $ startup_ms                        <dbl> 9928.4833, 5413.5000, 7041.6...
## $ daily_num_sessions_started        <dbl> 0.6250000, 1.0000000, 1.0000...
## $ active_hours_max                  <dbl> 2.32500000, 0.21527778, 0.32...
## $ uri_count_max                     <int> 120, 48, 99, 53, 19, 33, 32,...
## $ session_length_max                <dbl> 34.020000, 0.781111, 9.95972...
## $ search_count_max                  <int> 6, 2, 0, 0, 0, 2, 0, 5, 1, 1...
## $ num_pages_max                     <dbl> 2094, 22005, 7203, 2112, 18,...
## $ daily_unique_domains_max          <dbl> 20.000000, 7.000000, 1.00000...
## $ daily_max_tabs_max                <int> 14, 4, 11, 2, 2, 6, 8, 3, 6,...
## $ daily_tabs_opened_max             <int> 31, 7, 12, 1, 2, 7, 13, 71, ...
## $ startup_ms_max                    <dbl> 17491.667, 9579.000, 9194.33...
## $ daily_num_sessions_started_max    <int> 3, 1, 2, 2, 1, 2, 2, 2, 2, 1...
## $ label                             <fct> beta, beta, beta, beta, beta...
## $ install_year                      <dbl> 2016, 2016, 2018, 2013, 2017...
## $ profile_age                       <dbl> 1204, 1124, 336, 1606, 6, 17...
## $ fxa_configured                    <fct> False, False, False, False, ...
## $ sync_configured                   <fct> False, False, False, False, ...
## $ is_default_browser                <fct> True, False, False, True, Fa...
## $ locale                            <fct> en-US, en-US, en-US, en-US, ...
## $ normalized_channel                <fct> beta, beta, beta, beta, beta...
## $ app_version                       <dbl> 68, 68, 68, 68, 68, 68, 68, ...
## $ default_search_engine             <fct> DuckDuckGo, DuckDuckGo, Goog...
## $ country                           <fct> US, GB, US, GB, US, US, US, ...
## $ timezone_offset                   <int> -240, 60, 360, 360, -420, -2...
## $ num_addons                        <dbl> 7.0, 5.0, 6.0, 6.0, 5.0, 5.0...
## $ cpu_cores                         <dbl> 2, 2, 2, 2, 4, 4, 1, 2, 4, 3...
## $ cpu_speed_mhz                     <dbl> 2527, 2394, 2659, 2594, 3093...
## $ cpu_l2_cache_kb                   <dbl> 256, 256, 3072, 256, 256, 25...
## $ cpu_vendor                        <fct> Intel, Intel, Intel, Intel, ...
## $ memory_mb                         <int> 4022, 3810, 3317, 3965, 1627...
## $ os_version                        <ord> 6.1, 6.1, 10.0, 6.2, 6.3, 10...
## $ is_wow64                          <fct> False, False, False, False, ...
## $ FX_PAGE_LOAD_MS_2_PARENT          <dbl> 3133.947, 3226.048, 4400.155...
## $ TIME_TO_DOM_COMPLETE_MS           <dbl> 3713.308, 2561.596, 7244.930...
## $ TIME_TO_DOM_CONTENT_LOADED_END_MS <dbl> 3748.4715, 1346.8361, 3711.4...
## $ TIME_TO_LOAD_EVENT_END_MS         <dbl> 3730.944, 2523.810, 7280.457...
## $ TIME_TO_DOM_INTERACTIVE_MS        <dbl> 2444.7985, 1385.3500, 3929.2...
## $ TIME_TO_NON_BLANK_PAINT_MS        <dbl> 1972.6632, 935.6222, 1474.56...
## $ profile_age_cat                   <ord> < 5 years, < 5 years, < 2 ye...
## $ distro_id_norm                    <fct> Mozilla, Mozilla, Mozilla, M...
## $ timezone_cat                      <fct> "(-6,-4]", "(0,2]", "(4,6]",...
## $ memory_cat                        <ord> < 4GB, < 4GB, < 4GB, < 4GB, ...
## $ cpu_speed_cat                     <ord> < 3GHz, < 3GHz, < 3GHz, < 3G...
## $ cpu_cores_cat                     <ord> 2, 2, 2, 2, < 4, < 4, 1, 2, ...
## $ is_release                        <lgl> FALSE, FALSE, FALSE, FALSE, ...
## $ cpu_l2_cache_kb_cat               <fct> < 256, < 256, > 1024, < 256,...

If we want to get some metrics about data types, zeros, infinite numbers, and missing values, we can use the df_status function.

# Per-column profile of the VALIDATION data (zeros, NAs, infinite values, type,
# unique count), rendered as a styled table inside a full-width scroll box.
# This chunk sits in the Validation section, so it must profile df_validate_f;
# it previously profiled df_train_f and simply repeated the training table.
# print_results = FALSE stops df_status() from printing the table itself, so
# only the kable rendering is shown.
df_validate_f %>%
  df_status(print_results = FALSE) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
variable q_zeros p_zeros q_na p_na q_inf p_inf type unique
client_id 0 0.00 0 0 0 0 character 302805
num_active_days 0 0.00 0 0 0 0 integer 8
content_crashes 302819 100.00 0 0 0 0 integer 1
active_hours 7 0.00 0 0 0 0 numeric 180711
uri_count 0 0.00 0 0 0 0 numeric 18610
session_length 0 0.00 0 0 0 0 numeric 289472
search_count 80328 26.53 0 0 0 0 numeric 930
num_bookmarks 472 0.16 0 0 0 0 numeric 19727
num_pages 7 0.00 0 0 0 0 numeric 179391
daily_unique_domains 0 0.00 0 0 0 0 numeric 66058
daily_max_tabs 0 0.00 0 0 0 0 numeric 2888
daily_tabs_opened 0 0.00 0 0 0 0 numeric 4645
startup_ms 0 0.00 0 0 0 0 numeric 276270
daily_num_sessions_started 3100 1.02 0 0 0 0 numeric 654
active_hours_max 7 0.00 0 0 0 0 numeric 39216
uri_count_max 0 0.00 0 0 0 0 integer 3642
session_length_max 0 0.00 0 0 0 0 numeric 188078
search_count_max 80328 26.53 0 0 0 0 integer 140
num_pages_max 7 0.00 0 0 0 0 numeric 76011
daily_unique_domains_max 0 0.00 0 0 0 0 numeric 1996
daily_max_tabs_max 0 0.00 0 0 0 0 integer 541
daily_tabs_opened_max 0 0.00 0 0 0 0 integer 853
startup_ms_max 0 0.00 0 0 0 0 numeric 145192
daily_num_sessions_started_max 3100 1.02 0 0 0 0 integer 85
label 0 0.00 0 0 0 0 factor 2
install_year 0 0.00 0 0 0 0 numeric 21
profile_age 6320 2.09 0 0 0 0 numeric 4127
fxa_configured 0 0.00 0 0 0 0 factor 2
sync_configured 0 0.00 0 0 0 0 factor 2
is_default_browser 0 0.00 0 0 0 0 factor 2
locale 0 0.00 0 0 0 0 factor 2
normalized_channel 0 0.00 0 0 0 0 factor 2
app_version 0 0.00 0 0 0 0 numeric 1
default_search_engine 0 0.00 0 0 0 0 factor 6
country 0 0.00 0 0 0 0 factor 2
timezone_offset 736 0.24 0 0 0 0 integer 35
num_addons 53 0.02 0 0 0 0 numeric 2124
cpu_cores 0 0.00 0 0 0 0 numeric 27
cpu_speed_mhz 0 0.00 0 0 0 0 numeric 1232
cpu_l2_cache_kb 0 0.00 0 0 0 0 numeric 8
cpu_vendor 0 0.00 0 0 0 0 factor 3
memory_mb 0 0.00 0 0 0 0 integer 5893
os_version 0 0.00 0 0 0 0 ordered-factor 5
is_wow64 0 0.00 0 0 0 0 factor 2
FX_PAGE_LOAD_MS_2_PARENT 0 0.00 0 0 0 0 numeric 294778
TIME_TO_DOM_COMPLETE_MS 0 0.00 0 0 0 0 numeric 300964
TIME_TO_DOM_CONTENT_LOADED_END_MS 0 0.00 0 0 0 0 numeric 300503
TIME_TO_LOAD_EVENT_END_MS 0 0.00 0 0 0 0 numeric 301045
TIME_TO_DOM_INTERACTIVE_MS 0 0.00 0 0 0 0 numeric 300036
TIME_TO_NON_BLANK_PAINT_MS 0 0.00 0 0 0 0 numeric 296802
profile_age_cat 0 0.00 0 0 0 0 ordered-factor 6
distro_id_norm 0 0.00 0 0 0 0 factor 4
timezone_cat 0 0.00 0 0 0 0 factor 13
memory_cat 0 0.00 0 0 0 0 ordered-factor 6
cpu_speed_cat 0 0.00 0 0 0 0 ordered-factor 5
cpu_cores_cat 0 0.00 0 0 0 0 ordered-factor 6
is_release 59627 19.69 0 0 0 0 logical 2
cpu_l2_cache_kb_cat 0 0.00 0 0 0 0 factor 4
  • q_zeros: quantity of zeros (p_zeros: in percent)
  • q_inf: quantity of infinite values (p_inf: in percent)
  • q_na: quantity of NA (p_na: in percent)
  • type: factor, ordered-factor, numeric, integer, logical or character
  • unique: quantity of unique values

Observations

Are all the variables in the correct data type?

Yes, no incorrectly typed variables were found. It seems that this has already been dealt with in preprocessing.

Any variables with lots of zeros?

Yes. Variables with lots of zeros may not be useful for modeling and, in some cases, they may dramatically bias the model. For example, content_crashes is zero in 100% of the rows.

Any variables with lots of NAs?

None. Good news.

Any high cardinality variable?

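Factor/categorical variables with a high number of different values (~30) tend to cause overfitting if the categories have low cardinality. In the training data, the factor with the most levels is timezone_cat (13), so no factor variable reaches that threshold.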

Step 2 : Beta vs Release

Training

# Split the training data by label. `%in%` never yields NA, so rows with a
# missing label are left out, exactly as with `which(label == ...)`.
df_release <- df_train_f[df_train_f$label %in% "release", ]
df_beta <- df_train_f[df_train_f$label %in% "beta", ]

# Label frequencies for the whole training set; the result is kept in `f`.
f <- freq(df_train_f$label)

# Column-wise summary of the release subset.
summary(df_release)
##   client_id         num_active_days content_crashes  active_hours   
##  Length:243192      Min.   :1.00    Min.   :0       Min.   :0.0000  
##  Class :character   1st Qu.:4.00    1st Qu.:0       1st Qu.:0.2686  
##  Mode  :character   Median :6.00    Median :0       Median :0.5744  
##                     Mean   :5.57    Mean   :0       Mean   :0.8469  
##                     3rd Qu.:8.00    3rd Qu.:0       3rd Qu.:1.1266  
##                     Max.   :8.00    Max.   :0       Max.   :7.1222  
##                                                                     
##    uri_count       session_length      search_count    num_bookmarks     
##  Min.   :   1.00   Min.   : 0.01926   Min.   : 0.000   Min.   :    0.00  
##  1st Qu.:  44.33   1st Qu.: 2.15904   1st Qu.: 0.000   1st Qu.:   10.00  
##  Median :  96.67   Median : 6.33512   Median : 0.875   Median :   26.00  
##  Mean   : 156.24   Mean   : 9.28218   Mean   : 2.377   Mean   :  158.94  
##  3rd Qu.: 197.00   3rd Qu.:13.66085   3rd Qu.: 3.000   3rd Qu.:   85.21  
##  Max.   :2391.25   Max.   :91.06639   Max.   :45.750   Max.   :18632.00  
##                                                                          
##    num_pages      daily_unique_domains daily_max_tabs    daily_tabs_opened
##  Min.   :     0   Min.   : 1.000       Min.   :  0.625   Min.   :  1.000  
##  1st Qu.:  1022   1st Qu.: 2.283       1st Qu.:  2.500   1st Qu.:  4.000  
##  Median :  5536   Median : 3.600       Median :  3.714   Median :  8.833  
##  Mean   : 17331   Mean   : 4.968       Mean   :  6.200   Mean   : 17.093  
##  3rd Qu.: 19681   3rd Qu.: 6.071       3rd Qu.:  6.000   3rd Qu.: 19.167  
##  Max.   :168416   Max.   :39.375       Max.   :445.375   Max.   :347.500  
##                                                                           
##    startup_ms      daily_num_sessions_started active_hours_max 
##  Min.   :    261   Min.   : 0.000             Min.   : 0.0000  
##  1st Qu.:   1433   1st Qu.: 1.250             1st Qu.: 0.5403  
##  Median :   3231   Median : 2.000             Median : 1.1542  
##  Mean   :   9832   Mean   : 2.889             Mean   : 1.6251  
##  3rd Qu.:   8395   3rd Qu.: 3.500             3rd Qu.: 2.1903  
##  Max.   :5358123   Max.   :32.250             Max.   :23.9667  
##                                                                
##  uri_count_max     session_length_max search_count_max  num_pages_max   
##  Min.   :    1.0   Min.   :  0.0306   Min.   :  0.000   Min.   :     0  
##  1st Qu.:   86.0   1st Qu.:  4.4085   1st Qu.:  0.000   1st Qu.:  1142  
##  Median :  196.0   Median : 11.7017   Median :  2.000   Median :  5706  
##  Mean   :  321.4   Mean   : 18.2107   Mean   :  5.434   Mean   : 17519  
##  3rd Qu.:  400.0   3rd Qu.: 26.1284   3rd Qu.:  7.000   3rd Qu.: 19922  
##  Max.   :18032.0   Max.   :384.2883   Max.   :217.000   Max.   :172543  
##                                                                         
##  daily_unique_domains_max daily_max_tabs_max daily_tabs_opened_max
##  Min.   :  1.000          Min.   :   1.000   Min.   :   1.00      
##  1st Qu.:  3.125          1st Qu.:   4.000   1st Qu.:   7.00      
##  Median :  6.000          Median :   6.000   Median :  17.00      
##  Mean   :  8.552          Mean   :   9.318   Mean   :  33.27      
##  3rd Qu.: 11.000          3rd Qu.:   9.000   3rd Qu.:  38.00      
##  Max.   :100.000          Max.   :2425.000   Max.   :2410.00      
##                                                                   
##  startup_ms_max     daily_num_sessions_started_max     label       
##  Min.   :     271   Min.   :  0.000                beta   :     0  
##  1st Qu.:    2310   1st Qu.:  2.000                release:243192  
##  Median :    5695   Median :  4.000                                
##  Mean   :   25484   Mean   :  5.249                                
##  3rd Qu.:   16705   3rd Qu.:  6.000                                
##  Max.   :39562826   Max.   :100.000                                
##                                                                    
##   install_year   profile_age     fxa_configured sync_configured
##  Min.   :2000   Min.   :   0.0   False:197466   False:194974   
##  1st Qu.:2016   1st Qu.: 257.0   True : 45726   True : 48218   
##  Median :2018   Median : 698.0                                 
##  Mean   :2017   Mean   : 894.7                                 
##  3rd Qu.:2018   3rd Qu.:1374.0                                 
##  Max.   :2019   Max.   :6922.0                                 
##                                                                
##  is_default_browser   locale       normalized_channel  app_version
##  False:101700       en-GB: 23966   beta   :     0     Min.   :67  
##  True :141492       en-US:219226   release:243192     1st Qu.:67  
##                                                       Median :67  
##                                                       Mean   :67  
##                                                       3rd Qu.:67  
##                                                       Max.   :67  
##                                                                   
##          default_search_engine country     timezone_offset 
##  Bing               :  4442    GB: 36767   Min.   :-720.0  
##  DuckDuckGo         :  8328    US:206425   1st Qu.:-300.0  
##  Google             :203781                Median :-240.0  
##  other (bundled)    :   737                Mean   :-238.7  
##  other (non-bundled): 25549                3rd Qu.:-240.0  
##  Yahoo              :   355                Max.   : 720.0  
##                                                            
##    num_addons       cpu_cores      cpu_speed_mhz   cpu_l2_cache_kb
##  Min.   : 0.000   Min.   : 1.000   Min.   :  792   Min.   : 128   
##  1st Qu.: 4.000   1st Qu.: 2.000   1st Qu.: 2261   1st Qu.: 256   
##  Median : 5.000   Median : 2.000   Median : 2712   Median : 256   
##  Mean   : 5.652   Mean   : 3.143   Mean   : 2711   Mean   : 626   
##  3rd Qu.: 6.000   3rd Qu.: 4.000   3rd Qu.: 3193   3rd Qu.: 512   
##  Max.   :61.000   Max.   :40.000   Max.   :15077   Max.   :6144   
##                                                                   
##  cpu_vendor       memory_mb      os_version      is_wow64     
##  AMD  : 36898   Min.   :   512   Other:    15   False:230560  
##  Intel:206286   1st Qu.:  4011   6.1  : 65588   True : 12632  
##  Other:     8   Median :  8069   6.2  :  1756                 
##                 Mean   :  9444   6.3  : 13121                 
##                 3rd Qu.: 12144   10.0 :162712                 
##                 Max.   :524254                                
##                                                               
##  FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
##  Min.   :    3.814        Min.   :   15          
##  1st Qu.: 1900.609        1st Qu.: 1728          
##  Median : 2657.412        Median : 2498          
##  Mean   : 3030.073        Mean   : 3291          
##  3rd Qu.: 3797.899        3rd Qu.: 3878          
##  Max.   :10000.000        Max.   :50000          
##                                                  
##  TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
##  Min.   :   15.62                  Min.   :   15            
##  1st Qu.: 1120.26                  1st Qu.: 1614            
##  Median : 1629.36                  Median : 2300            
##  Mean   : 2290.86                  Mean   : 3017            
##  3rd Qu.: 2592.47                  3rd Qu.: 3534            
##  Max.   :44262.02                  Max.   :50000            
##                                                             
##  TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS   profile_age_cat 
##  Min.   :   22.89           Min.   :    27.0           < 1 week  : 7613  
##  1st Qu.:  978.79           1st Qu.:   777.2           < 1 month : 6802  
##  Median : 1374.34           Median :  1095.7           < 6 months:32979  
##  Mean   : 1796.62           Mean   :  1445.7           < 2 years :77214  
##  3rd Qu.: 2079.06           3rd Qu.:  1651.2           < 5 years :86054  
##  Max.   :34688.55           Max.   :100000.0           > 5 years :32530  
##                                                                          
##  distro_id_norm      timezone_cat     memory_cat     cpu_speed_cat   
##  acer   :  2619   (-6,-4]  :146732   < 1GB :   346   < 1GHz :   452  
##  Mozilla:237022   (-8,-6]  : 54613   < 2GB :  5534   < 2GHz : 38361  
##  other  :  2351   (0,2]    : 35955   < 4GB : 66120   < 3GHz :123428  
##  Yahoo  :  1200   (4,6]    :  2165   < 6GB : 15982   < 4GHz : 76780  
##                   [-12,-10]:   958   < 16GB:144433   > 16GHz:  4171  
##                   (6,8]    :   870   > 16GB: 10777                   
##                   (Other)  :  1899                                   
##  cpu_cores_cat is_release     cpu_l2_cache_kb_cat
##  1   :  3122   Mode:logical   < 1024: 22370      
##  2   :120526   TRUE:243192    < 256 :179679      
##  < 4 :104846                  < 512 : 13262      
##  < 8 : 13778                  > 1024: 27881      
##  < 16:   745                                     
##  > 16:   175                                     
## 
## Descriptive statistics for every column of the beta dataframe
df_beta %>% summary()
##   client_id         num_active_days content_crashes  active_hours   
##  Length:59627       Min.   :1.000   Min.   :0       Min.   :0.0000  
##  Class :character   1st Qu.:4.000   1st Qu.:0       1st Qu.:0.2250  
##  Mode  :character   Median :6.000   Median :0       Median :0.5310  
##                     Mean   :5.346   Mean   :0       Mean   :0.8237  
##                     3rd Qu.:8.000   3rd Qu.:0       3rd Qu.:1.1029  
##                     Max.   :8.000   Max.   :0       Max.   :7.2901  
##                                                                     
##    uri_count       session_length       search_count     num_bookmarks    
##  Min.   :   1.00   Min.   :  0.01667   Min.   : 0.0000   Min.   :    0.0  
##  1st Qu.:  37.00   1st Qu.:  2.52840   1st Qu.: 0.0000   1st Qu.:   10.0  
##  Median :  86.67   Median :  7.71056   Median : 0.8333   Median :   26.0  
##  Mean   : 152.75   Mean   : 12.29620   Mean   : 2.4506   Mean   :  242.5  
##  3rd Qu.: 188.50   3rd Qu.: 19.57926   3rd Qu.: 2.8333   3rd Qu.:   96.0  
##  Max.   :2931.00   Max.   :240.80486   Max.   :51.0000   Max.   :40401.0  
##                                                                           
##    num_pages      daily_unique_domains daily_max_tabs    
##  Min.   :     1   Min.   : 1.000       Min.   :   1.000  
##  1st Qu.:   686   1st Qu.: 2.167       1st Qu.:   2.600  
##  Median :  4186   Median : 3.562       Median :   4.250  
##  Mean   : 17363   Mean   : 5.060       Mean   :   9.604  
##  3rd Qu.: 18605   3rd Qu.: 6.167       3rd Qu.:   8.000  
##  Max.   :179658   Max.   :44.000       Max.   :1012.625  
##                                                          
##  daily_tabs_opened   startup_ms       daily_num_sessions_started
##  Min.   :  1.00    Min.   :     269   Min.   : 0.000            
##  1st Qu.:  4.00    1st Qu.:    2102   1st Qu.: 1.000            
##  Median :  9.00    Median :    5088   Median : 1.667            
##  Mean   : 20.49    Mean   :   25836   Mean   : 2.369            
##  3rd Qu.: 21.75    3rd Qu.:   12619   3rd Qu.: 2.875            
##  Max.   :518.25    Max.   :17109506   Max.   :32.833            
##                                                                 
##  active_hours_max uri_count_max   session_length_max  search_count_max 
##  Min.   : 0.000   Min.   :    1   Min.   :   0.0214   Min.   :  0.000  
##  1st Qu.: 0.450   1st Qu.:   68   1st Qu.:   4.8778   1st Qu.:  0.000  
##  Median : 1.064   Median :  172   Median :  14.8089   Median :  2.000  
##  Mean   : 1.578   Mean   :  311   Mean   :  22.7066   Mean   :  5.636  
##  3rd Qu.: 2.165   3rd Qu.:  382   3rd Qu.:  31.5190   3rd Qu.:  7.000  
##  Max.   :24.983   Max.   :15626   Max.   :1255.3822   Max.   :188.000  
##                                                                        
##  num_pages_max    daily_unique_domains_max daily_max_tabs_max
##  Min.   :     1   Min.   :  1.000          Min.   :   1.00   
##  1st Qu.:   785   1st Qu.:  3.000          1st Qu.:   4.00   
##  Median :  4340   Median :  5.500          Median :   6.00   
##  Mean   : 17559   Mean   :  8.744          Mean   :  13.81   
##  3rd Qu.: 18886   3rd Qu.: 11.000          3rd Qu.:  12.00   
##  Max.   :180456   Max.   :100.000          Max.   :3149.00   
##                                                              
##  daily_tabs_opened_max startup_ms_max      daily_num_sessions_started_max
##  Min.   :   1.00       Min.   :      269   Min.   : 0.000                
##  1st Qu.:   6.00       1st Qu.:     3186   1st Qu.: 2.000                
##  Median :  17.00       Median :     8389   Median : 3.000                
##  Mean   :  39.65       Mean   :    86290   Mean   : 4.281                
##  3rd Qu.:  42.00       3rd Qu.:    23149   3rd Qu.: 5.000                
##  Max.   :3302.00       Max.   :106338296   Max.   :88.000                
##                                                                          
##      label        install_year   profile_age     fxa_configured
##  beta   :59627   Min.   :1993   Min.   :   0.0   False:51826   
##  release:    0   1st Qu.:2017   1st Qu.: 271.0   True : 7801   
##                  Median :2018   Median : 711.0                 
##                  Mean   :2017   Mean   : 893.8                 
##                  3rd Qu.:2018   3rd Qu.:1354.0                 
##                  Max.   :2019   Max.   :7051.0                 
##                                                                
##  sync_configured is_default_browser   locale      normalized_channel
##  False:51267     False:26001        en-GB: 2581   beta   :59627     
##  True : 8360     True :33626        en-US:57046   release:    0     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##   app_version         default_search_engine country    timezone_offset 
##  Min.   :67   Bing               :  833     GB:17333   Min.   :-720.0  
##  1st Qu.:67   DuckDuckGo         : 2630     US:42294   1st Qu.:-300.0  
##  Median :67   Google             :52211                Median :-240.0  
##  Mean   :67   other (bundled)    :  125                Mean   :-143.9  
##  3rd Qu.:67   other (non-bundled): 3828                3rd Qu.:  60.0  
##  Max.   :67   Yahoo              :    0                Max.   : 840.0  
##                                                                        
##    num_addons        cpu_cores      cpu_speed_mhz   cpu_l2_cache_kb 
##  Min.   :  0.000   Min.   : 1.000   Min.   :  798   Min.   : 128.0  
##  1st Qu.:  6.000   1st Qu.: 2.000   1st Qu.: 2200   1st Qu.: 256.0  
##  Median :  7.000   Median : 2.000   Median : 2594   Median : 256.0  
##  Mean   :  7.855   Mean   : 2.976   Mean   : 2678   Mean   : 679.9  
##  3rd Qu.:  8.000   3rd Qu.: 4.000   3rd Qu.: 3192   3rd Qu.: 512.0  
##  Max.   :170.000   Max.   :36.000   Max.   :37214   Max.   :6144.0  
##                                                                     
##  cpu_vendor      memory_mb      os_version     is_wow64    
##  AMD  : 8757   Min.   :   511   Other:   16   False:44733  
##  Intel:50820   1st Qu.:  3984   6.1  :17123   True :14894  
##  Other:   50   Median :  8031   6.2  :  674                
##                Mean   :  8965   6.3  : 3785                
##                3rd Qu.: 10238   10.0 :38029                
##                Max.   :262078                              
##                                                            
##  FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
##  Min.   :    1            Min.   :    7          
##  1st Qu.: 2028            1st Qu.: 1894          
##  Median : 2952            Median : 2918          
##  Mean   : 3464            Mean   : 4389          
##  3rd Qu.: 4487            3rd Qu.: 5106          
##  Max.   :10000            Max.   :50000          
##                                                  
##  TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
##  Min.   :   18.17                  Min.   :    7            
##  1st Qu.: 1217.61                  1st Qu.: 1796            
##  Median : 1856.04                  Median : 2739            
##  Mean   : 2737.64                  Mean   : 4127            
##  3rd Qu.: 3185.18                  3rd Qu.: 4758            
##  Max.   :50000.00                  Max.   :50000            
##                                                             
##  TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS   profile_age_cat 
##  Min.   :   40.84           Min.   :    7.0            < 1 week  : 1590  
##  1st Qu.: 1081.05           1st Qu.:  845.6            < 1 month : 2485  
##  Median : 1618.02           Median : 1251.7            < 6 months: 8173  
##  Mean   : 2404.39           Mean   : 1833.7            < 2 years :18155  
##  3rd Qu.: 2738.66           3rd Qu.: 2052.5            < 5 years :21807  
##  Max.   :50000.00           Max.   :90081.0            > 5 years : 7417  
##                                                                          
##  distro_id_norm   timezone_cat    memory_cat    cpu_speed_cat  
##  acer   :    3   (-6,-4]:26166   < 1GB :  378   < 1GHz :  157  
##  Mozilla:59615   (0,2]  :15927   < 2GB : 2924   < 2GHz : 9968  
##  other  :    7   (-8,-6]:10863   < 4GB :18788   < 3GHz :30935  
##  Yahoo  :    2   (4,6]  : 3244   < 6GB : 3558   < 4GHz :17258  
##                  (6,8]  : 1410   < 16GB:31147   > 16GHz: 1309  
##                  (2,4]  : 1119   > 16GB: 2832                  
##                  (Other):  898                                 
##  cpu_cores_cat is_release      cpu_l2_cache_kb_cat
##  1   : 1224    Mode :logical   < 1024: 6014       
##  2   :32916    FALSE:59627     < 256 :42540       
##  < 4 :22776                    < 512 : 3321       
##  < 8 : 2507                    > 1024: 7752       
##  < 16:  180                                       
##  > 16:   24                                       
## 

Validation

## Split the validation dataframe into one subset per label value.
## %in% never yields NA, so rows with a missing label are left out of both.
df_v_release <- df_validate_f[df_validate_f$label %in% "release", ]
df_v_beta <- df_validate_f[df_validate_f$label %in% "beta", ]

## Frequency of each label value in the validation dataframe
## (freq() presumably comes from funModeling; result is kept in f)
f <- freq(df_validate_f$label)

## Descriptive statistics for the release validation subset
df_v_release %>% summary()
##   client_id         num_active_days content_crashes  active_hours   
##  Length:257697      Min.   :1.00    Min.   :0       Min.   :0.0000  
##  Class :character   1st Qu.:4.00    1st Qu.:0       1st Qu.:0.2641  
##  Mode  :character   Median :6.00    Median :0       Median :0.5752  
##                     Mean   :5.71    Mean   :0       Mean   :0.8525  
##                     3rd Qu.:8.00    3rd Qu.:0       3rd Qu.:1.1399  
##                     Max.   :8.00    Max.   :0       Max.   :7.2205  
##                                                                     
##    uri_count       session_length      search_count    num_bookmarks     
##  Min.   :   1.00   Min.   : 0.01572   Min.   : 0.000   Min.   :    0.00  
##  1st Qu.:  44.00   1st Qu.: 2.31995   1st Qu.: 0.000   1st Qu.:   10.00  
##  Median :  97.43   Median : 6.78083   Median : 1.000   Median :   25.33  
##  Mean   : 158.72   Mean   : 9.70679   Mean   : 2.446   Mean   :  158.03  
##  3rd Qu.: 200.14   3rd Qu.:14.77410   3rd Qu.: 3.000   3rd Qu.:   84.00  
##  Max.   :2483.17   Max.   :90.44222   Max.   :45.000   Max.   :20002.14  
##                                                                          
##    num_pages        daily_unique_domains daily_max_tabs    
##  Min.   :     0.0   Min.   : 1.000       Min.   :  0.5714  
##  1st Qu.:   991.3   1st Qu.: 2.287       1st Qu.:  2.5000  
##  Median :  5308.0   Median : 3.651       Median :  3.8000  
##  Mean   : 17089.9   Mean   : 5.112       Mean   :  6.3472  
##  3rd Qu.: 19359.8   3rd Qu.: 6.250       3rd Qu.:  6.2857  
##  Max.   :168812.1   Max.   :42.400       Max.   :449.3333  
##                                                            
##  daily_tabs_opened   startup_ms       daily_num_sessions_started
##  Min.   :  1.000   Min.   :     239   Min.   : 0.000            
##  1st Qu.:  4.000   1st Qu.:    1567   1st Qu.: 1.167            
##  Median :  8.857   Median :    3346   Median : 2.000            
##  Mean   : 17.187   Mean   :   27279   Mean   : 2.831            
##  3rd Qu.: 19.500   3rd Qu.:    7770   3rd Qu.: 3.375            
##  Max.   :357.000   Max.   :22594812   Max.   :32.250            
##                                                                 
##  active_hours_max  uri_count_max     session_length_max search_count_max 
##  Min.   : 0.0000   Min.   :    1.0   Min.   :  0.0197   Min.   :  0.000  
##  1st Qu.: 0.5375   1st Qu.:   86.0   1st Qu.:  4.7883   1st Qu.:  0.000  
##  Median : 1.1653   Median :  199.0   Median : 12.8028   Median :  3.000  
##  Mean   : 1.6370   Mean   :  328.4   Mean   : 18.6141   Mean   :  5.634  
##  3rd Qu.: 2.2208   3rd Qu.:  409.0   3rd Qu.: 27.0975   3rd Qu.:  7.000  
##  Max.   :25.4403   Max.   :18524.0   Max.   :524.5456   Max.   :208.000  
##                                                                          
##  num_pages_max    daily_unique_domains_max daily_max_tabs_max
##  Min.   :     0   Min.   :  1.000          Min.   :   1.00   
##  1st Qu.:  1113   1st Qu.:  3.200          1st Qu.:   4.00   
##  Median :  5490   Median :  6.000          Median :   6.00   
##  Mean   : 17289   Mean   :  8.837          Mean   :   9.54   
##  3rd Qu.: 19606   3rd Qu.: 11.000          3rd Qu.:  10.00   
##  Max.   :170532   Max.   :100.000          Max.   :2215.00   
##                                                              
##  daily_tabs_opened_max startup_ms_max      daily_num_sessions_started_max
##  Min.   :   1.00       Min.   :      239   Min.   :  0.00                
##  1st Qu.:   7.00       1st Qu.:     2618   1st Qu.:  2.00                
##  Median :  17.00       Median :     5997   Median :  4.00                
##  Mean   :  33.54       Mean   :    95160   Mean   :  5.18                
##  3rd Qu.:  38.00       3rd Qu.:    14981   3rd Qu.:  6.00                
##  Max.   :2342.00       Max.   :171712978   Max.   :184.00                
##                                                                          
##      label         install_year   profile_age     fxa_configured
##  beta   :     0   Min.   :2000   Min.   :   0.0   False:207292  
##  release:257697   1st Qu.:2017   1st Qu.: 235.0   True : 50405  
##                   Median :2018   Median : 673.0                 
##                   Mean   :2017   Mean   : 883.9                 
##                   3rd Qu.:2019   3rd Qu.:1368.0                 
##                   Max.   :2019   Max.   :6972.0                 
##                                                                 
##  sync_configured is_default_browser   locale       normalized_channel
##  False:205539    False:108253       en-GB: 24771   beta   :     0    
##  True : 52158    True :149444       en-US:232926   release:257697    
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##   app_version         default_search_engine country     timezone_offset 
##  Min.   :68   Bing               :  5142    GB: 37712   Min.   :-720.0  
##  1st Qu.:68   DuckDuckGo         :  9598    US:219985   1st Qu.:-300.0  
##  Median :68   Google             :225161                Median :-240.0  
##  Mean   :68   missing            :    33                Mean   :-240.5  
##  3rd Qu.:68   other (bundled)    :   682                3rd Qu.:-240.0  
##  Max.   :68   other (non-bundled): 16655                Max.   : 780.0  
##               Yahoo              :   426                                
##    num_addons       cpu_cores      cpu_speed_mhz   cpu_l2_cache_kb 
##  Min.   : 0.000   Min.   : 1.000   Min.   :  768   Min.   : 128.0  
##  1st Qu.: 4.000   1st Qu.: 2.000   1st Qu.: 2261   1st Qu.: 256.0  
##  Median : 5.000   Median : 3.000   Median : 2712   Median : 256.0  
##  Mean   : 5.678   Mean   : 3.192   Mean   : 2713   Mean   : 610.8  
##  3rd Qu.: 6.000   3rd Qu.: 4.000   3rd Qu.: 3193   3rd Qu.: 512.0  
##  Max.   :71.000   Max.   :50.000   Max.   :28900   Max.   :6144.0  
##                                                                    
##  cpu_vendor       memory_mb       os_version      is_wow64     
##  AMD  : 38180   Min.   :    512   Other:    16   False:244710  
##  Intel:219506   1st Qu.:   4021   6.1  : 65859   True : 12987  
##  Other:    11   Median :   8073   6.2  :  1819                 
##                 Mean   :   9720   6.3  : 13387                 
##                 3rd Qu.:  12180   10.0 :176616                 
##                 Max.   :1572801                                
##                                                                
##  FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
##  Min.   :    1            Min.   :   28.78       
##  1st Qu.: 1815            1st Qu.: 1627.44       
##  Median : 2520            Median : 2328.42       
##  Mean   : 2894            Mean   : 3039.55       
##  3rd Qu.: 3590            3rd Qu.: 3547.01       
##  Max.   :10000            Max.   :47816.25       
##                                                  
##  TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
##  Min.   :   23.71                  Min.   :   28.95         
##  1st Qu.: 1105.90                  1st Qu.: 1542.78         
##  Median : 1592.19                  Median : 2182.18         
##  Mean   : 2146.62                  Mean   : 2837.31         
##  3rd Qu.: 2481.38                  3rd Qu.: 3301.45         
##  Max.   :48375.15                  Max.   :47816.25         
##                                                             
##  TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS   profile_age_cat 
##  Min.   :   19.86           Min.   :    16.71          < 1 week  : 8309  
##  1st Qu.:  973.66           1st Qu.:   775.88          < 1 month : 9838  
##  Median : 1362.24           Median :  1093.90          < 6 months:35158  
##  Mean   : 1761.98           Mean   :  1448.36          < 2 years :81811  
##  3rd Qu.: 2037.65           3rd Qu.:  1652.08          < 5 years :87688  
##  Max.   :41404.35           Max.   :100000.00          > 5 years :34893  
##                                                                          
##  distro_id_norm      timezone_cat     memory_cat     cpu_speed_cat   
##  acer   :  2790   (-6,-4]  :157717   < 1GB :   316   < 1GHz :   448  
##  Mozilla:251379   (-8,-6]  : 57182   < 2GB :  5300   < 2GHz : 40596  
##  other  :  2313   (0,2]    : 36782   < 4GB : 66569   < 3GHz :130993  
##  Yahoo  :  1215   (4,6]    :  2209   < 6GB : 16419   < 4GHz : 81353  
##                   [-12,-10]:   957   < 16GB:156709   > 16GHz:  4307  
##                   (6,8]    :   797   > 16GB: 12384                   
##                   (Other)  :  2053                                   
##  cpu_cores_cat is_release     cpu_l2_cache_kb_cat
##  1   :  3101   Mode:logical   < 1024: 22578      
##  2   :124514   TRUE:257697    < 256 :192910      
##  < 4 :112698                  < 512 : 13930      
##  < 8 : 16276                  > 1024: 28279      
##  < 16:   900                                     
##  > 16:   208                                     
## 
## Descriptive statistics for the beta validation subset
df_v_beta %>% summary()
##   client_id         num_active_days content_crashes  active_hours   
##  Length:70345       Min.   :1.000   Min.   :0       Min.   :0.0000  
##  Class :character   1st Qu.:3.000   1st Qu.:0       1st Qu.:0.2074  
##  Mode  :character   Median :5.000   Median :0       Median :0.5028  
##                     Mean   :4.913   Mean   :0       Mean   :0.7988  
##                     3rd Qu.:7.000   3rd Qu.:0       3rd Qu.:1.0601  
##                     Max.   :8.000   Max.   :0       Max.   :7.5403  
##                                                                     
##    uri_count       session_length     search_count    num_bookmarks    
##  Min.   :   1.00   Min.   :  0.020   Min.   : 0.000   Min.   :    0.0  
##  1st Qu.:  33.25   1st Qu.:  2.213   1st Qu.: 0.000   1st Qu.:    9.0  
##  Median :  80.50   Median :  7.154   Median : 0.750   Median :   23.0  
##  Mean   : 146.33   Mean   : 12.337   Mean   : 2.324   Mean   :  225.4  
##  3rd Qu.: 179.00   3rd Qu.: 18.739   3rd Qu.: 2.667   3rd Qu.:   85.0  
##  Max.   :2983.00   Max.   :286.698   Max.   :50.000   Max.   :39519.0  
##                                                                        
##    num_pages        daily_unique_domains daily_max_tabs   
##  Min.   :     0.0   Min.   : 1.000       Min.   :  0.400  
##  1st Qu.:   543.3   1st Qu.: 2.125       1st Qu.:  2.500  
##  Median :  3347.5   Median : 3.500       Median :  4.125  
##  Mean   : 15614.0   Mean   : 5.148       Mean   :  9.020  
##  3rd Qu.: 15660.6   3rd Qu.: 6.167       3rd Qu.:  7.750  
##  Max.   :177583.0   Max.   :49.292       Max.   :910.400  
##                                                           
##  daily_tabs_opened   startup_ms       daily_num_sessions_started
##  Min.   :  1.00    Min.   :     289   Min.   : 0.000            
##  1st Qu.:  3.50    1st Qu.:    2219   1st Qu.: 1.000            
##  Median :  8.50    Median :    5016   Median : 1.667            
##  Mean   : 20.03    Mean   :   50073   Mean   : 2.398            
##  3rd Qu.: 21.00    3rd Qu.:   11210   3rd Qu.: 3.000            
##  Max.   :554.00    Max.   :50660200   Max.   :32.000            
##                                                                 
##  active_hours_max  uri_count_max     session_length_max search_count_max
##  Min.   : 0.0000   Min.   :    1.0   Min.   :  0.0411   Min.   :  0.0   
##  1st Qu.: 0.3917   1st Qu.:   57.0   1st Qu.:  4.0028   1st Qu.:  0.0   
##  Median : 0.9625   Median :  152.0   Median : 12.9614   Median :  2.0   
##  Mean   : 1.4712   Mean   :  287.3   Mean   : 22.2730   Mean   :  5.1   
##  3rd Qu.: 2.0014   3rd Qu.:  352.0   3rd Qu.: 28.9514   3rd Qu.:  6.0   
##  Max.   :31.1278   Max.   :17548.0   Max.   :922.2850   Max.   :313.0   
##                                                                         
##  num_pages_max    daily_unique_domains_max daily_max_tabs_max
##  Min.   :     0   Min.   :  1.000          Min.   :   1.00   
##  1st Qu.:   620   1st Qu.:  3.000          1st Qu.:   4.00   
##  Median :  3513   Median :  5.200          Median :   6.00   
##  Mean   : 15780   Mean   :  8.582          Mean   :  12.83   
##  3rd Qu.: 15872   3rd Qu.: 10.500          3rd Qu.:  11.00   
##  Max.   :182555   Max.   :100.000          Max.   :1779.00   
##                                                              
##  daily_tabs_opened_max startup_ms_max      daily_num_sessions_started_max
##  Min.   :   1.0        Min.   :      289   Min.   :  0.000               
##  1st Qu.:   5.0        1st Qu.:     3296   1st Qu.:  1.000               
##  Median :  15.0        Median :     7801   Median :  3.000               
##  Mean   :  37.3        Mean   :   142010   Mean   :  4.141               
##  3rd Qu.:  39.0        3rd Qu.:    18500   3rd Qu.:  5.000               
##  Max.   :2551.0        Max.   :251665490   Max.   :110.000               
##                                                                          
##      label        install_year   profile_age     fxa_configured
##  beta   :70345   Min.   :2000   Min.   :   0.0   False:59564   
##  release:    0   1st Qu.:2017   1st Qu.: 213.0   True :10781   
##                  Median :2018   Median : 690.0                 
##                  Mean   :2017   Mean   : 875.3                 
##                  3rd Qu.:2019   3rd Qu.:1329.0                 
##                  Max.   :2019   Max.   :7095.0                 
##                                                                
##  sync_configured is_default_browser   locale      normalized_channel
##  False:58484     False:32258        en-GB: 2997   beta   :70345     
##  True :11861     True :38087        en-US:67348   release:    0     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##   app_version         default_search_engine country    timezone_offset 
##  Min.   :68   Bing               : 1011     GB:20569   Min.   :-720.0  
##  1st Qu.:68   DuckDuckGo         : 2936     US:49776   1st Qu.:-300.0  
##  Median :68   Google             :62032                Median :-240.0  
##  Mean   :68   missing            :   35                Mean   :-129.9  
##  3rd Qu.:68   other (bundled)    :   63                3rd Qu.:  60.0  
##  Max.   :68   other (non-bundled): 4266                Max.   : 840.0  
##               Yahoo              :    2                                
##    num_addons       cpu_cores      cpu_speed_mhz   cpu_l2_cache_kb 
##  Min.   : 0.000   Min.   : 1.000   Min.   :  633   Min.   : 128.0  
##  1st Qu.: 5.000   1st Qu.: 2.000   1st Qu.: 2195   1st Qu.: 256.0  
##  Median : 6.000   Median : 2.000   Median : 2594   Median : 256.0  
##  Mean   : 6.883   Mean   : 2.954   Mean   : 2656   Mean   : 674.6  
##  3rd Qu.: 7.000   3rd Qu.: 4.000   3rd Qu.: 3192   3rd Qu.: 512.0  
##  Max.   :76.000   Max.   :32.000   Max.   :37221   Max.   :6144.0  
##                                                                    
##  cpu_vendor      memory_mb      os_version     is_wow64    
##  AMD  :10123   Min.   :   511   Other:   12   False:50246  
##  Intel:60149   1st Qu.:  3981   6.1  :20330   True :20099  
##  Other:   73   Median :  7973   6.2  :  781                
##                Mean   :  8796   6.3  : 4768                
##                3rd Qu.:  8189   10.0 :44454                
##                Max.   :294902                              
##                                                            
##  FX_PAGE_LOAD_MS_2_PARENT TIME_TO_DOM_COMPLETE_MS
##  Min.   :    1            Min.   :   34          
##  1st Qu.: 2044            1st Qu.: 1902          
##  Median : 3045            Median : 3024          
##  Mean   : 3593            Mean   : 4597          
##  3rd Qu.: 4727            3rd Qu.: 5481          
##  Max.   :10000            Max.   :50000          
##                                                  
##  TIME_TO_DOM_CONTENT_LOADED_END_MS TIME_TO_LOAD_EVENT_END_MS
##  Min.   :   30.71                  Min.   :   34            
##  1st Qu.: 1271.67                  1st Qu.: 1816            
##  Median : 2001.34                  Median : 2857            
##  Mean   : 2896.94                  Mean   : 4376            
##  3rd Qu.: 3454.47                  3rd Qu.: 5204            
##  Max.   :41450.65                  Max.   :50000            
##                                                             
##  TIME_TO_DOM_INTERACTIVE_MS TIME_TO_NON_BLANK_PAINT_MS   profile_age_cat 
##  Min.   :   26              Min.   :     4.0           < 1 week  : 1869  
##  1st Qu.: 1143              1st Qu.:   889.2           < 1 month : 3232  
##  Median : 1764              Median :  1358.9           < 6 months:10897  
##  Mean   : 2627              Mean   :  2014.8           < 2 years :20628  
##  3rd Qu.: 3069              3rd Qu.:  2286.5           < 5 years :25035  
##  Max.   :49619              Max.   :100000.0           > 5 years : 8684  
##                                                                          
##  distro_id_norm   timezone_cat    memory_cat    cpu_speed_cat  
##  acer   :    4   (-6,-4]:29163   < 1GB :  396   < 1GHz :  217  
##  Mozilla:70331   (0,2]  :18167   < 2GB : 4078   < 2GHz :12505  
##  other  :    7   (-8,-6]:13002   < 4GB :22890   < 3GHz :36310  
##  Yahoo  :    3   (4,6]  : 4667   < 6GB : 4105   < 4GHz :19935  
##                  (6,8]  : 2289   < 16GB:35628   > 16GHz: 1378  
##                  (2,4]  : 1932   > 16GB: 3248                  
##                  (Other): 1125                                 
##  cpu_cores_cat is_release      cpu_l2_cache_kb_cat
##  1   : 1399    Mode :logical   < 1024: 7181       
##  2   :39571    FALSE:70345     < 256 :50220       
##  < 4 :26174                    < 512 : 3911       
##  < 8 : 2970                    > 1024: 9033       
##  < 16:  209                                       
##  > 16:   22                                       
## 

Step 3 - Analyzing Discrete Variables

Training

## Bar charts of the discrete variables: release training dataframe
plot_bar(
  data = df_release,
  ggtheme = theme_minimal(base_size = 15)
)

## Bar charts of the discrete variables: beta training dataframe
plot_bar(
  data = df_beta,
  ggtheme = theme_minimal(base_size = 15)
)

Validation

## Bar charts of the discrete variables: release validation dataframe
plot_bar(
  data = df_v_release,
  ggtheme = theme_minimal(base_size = 15)
)

## Bar charts of the discrete variables: beta validation dataframe
plot_bar(
  data = df_v_beta,
  ggtheme = theme_minimal(base_size = 15)
)

Step 4 - Analyzing Continuous Variables

Training

## Histograms of the continuous variables: release training dataframe
plot_histogram(
  data = df_release,
  ggtheme = theme_minimal(base_size = 15)
)

## Histograms of the continuous variables: beta training dataframe
plot_histogram(
  data = df_beta,
  ggtheme = theme_minimal(base_size = 15)
)

Validation

## View histogram of the release validation dataset.
## Uses df_v_release (the validation subset), not the training df_release,
## so this section does not just repeat the training histograms.
plot_histogram(df_v_release, ggtheme = theme_minimal(base_size = 15))

## View histogram of the beta validation dataset (df_v_beta)
plot_histogram(df_v_beta, ggtheme = theme_minimal(base_size = 15))

Observations

  • My first impression is that user engagement metrics might be a good path to follow

Plotting Density Curves

## URI count density per label, training data (x scale limited to 0-1000)
t <- ggplot(data = df_train_f) +
  aes(x = uri_count, group = label, fill = label) +
  geom_density(adjust = 1.5, alpha = 0.6) +
  scale_x_continuous(limits = c(0, 1000)) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("URI Count") +
  ylab("Density") +
  theme_ipsum()

## URI count density per label, validation data (same x limits)
v <- ggplot(data = df_validate_f) +
  aes(x = uri_count, group = label, fill = label) +
  geom_density(adjust = 1.5, alpha = 0.6) +
  scale_x_continuous(limits = c(0, 1000)) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("URI Count") +
  ylab("Density") +
  theme_ipsum()

## Show both panels side by side (one row, two columns)
plot_grid(
  t, v,
  ncol = 2,
  labels = c("Train", "Validate")
)

## Training: density of active_hours per label (no x-axis limit applied)
t <- ggplot(df_train_f, aes(active_hours, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Active Hours") +
  ylab("Density") +
  theme_ipsum()

## Validation: same plot built from the validation dataframe
v <- ggplot(df_validate_f, aes(active_hours, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Active Hours") +
  ylab("Density") +
  theme_ipsum()

## Train and validation panels side by side (one row, two columns)
plot_grid(t, v, labels = c("Train", "Validate"), ncol = 2)

## Training: density of num_pages per label, x scale limited to 0-50000
t <- ggplot(df_train_f, aes(num_pages, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  xlim(0, 50000) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Num Pages") +
  ylab("Density") +
  theme_ipsum()

## Validation: same plot built from the validation dataframe
v <- ggplot(df_validate_f, aes(num_pages, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  xlim(0, 50000) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Num Pages") +
  ylab("Density") +
  theme_ipsum()

## Train and validation panels side by side (one row, two columns)
plot_grid(t, v, labels = c("Train", "Validate"), ncol = 2)

## Training: density of session_length per label, x scale limited to 0-75
t <- ggplot(df_train_f, aes(session_length, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  xlim(0, 75) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Session Length") +
  ylab("Density") +
  theme_ipsum()

## Validation: same plot built from the validation dataframe
v <- ggplot(df_validate_f, aes(session_length, fill = label, group = label)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  xlim(0, 75) +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  xlab("Session Length") +
  ylab("Density") +
  theme_ipsum()

## Train and validation panels side by side (one row, two columns)
plot_grid(t, v, labels = c("Train", "Validate"), ncol = 2)

User Engagement Continuous Metrics

This section will focus only on user engagement continuous metrics. So, we are going to analyze the following metrics:

  • num_active_days
  • active_hours
  • active_hours_max
  • uri_count
  • uri_count_max
  • session_length
  • session_length_max
  • search_count
  • search_count_max
  • num_bookmarks
  • num_pages
  • num_pages_max
  • num_addons
  • daily_unique_domains
  • daily_unique_domains_max
  • daily_max_tabs
  • daily_max_tabs_max
  • daily_tabs_opened
  • daily_tabs_opened_max
  • daily_num_sessions_started
  • daily_num_sessions_started_max
  • startup_ms
  • install_year
  • profile_age
  • timezone_offset
  • memory_mb
  • cpu_cores
  • cpu_speed_mhz
  • cpu_l2_cache_kb

Training

## Show text_tbl as a styled table inside a full-width scroll box
kable(text_tbl) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
beta_num_active_days release_num_active_days beta_active_hours release_active_hours beta_active_hours_max release_active_hours_max beta_uri_count release_uri_count beta_uri_count_max release_uri_count_max beta_session_length release_session_length beta_session_length_max release_session_length_max beta_search_count release_search_count beta_search_count_max release_search_count_max beta_num_bookmarks release_num_bookmarks beta_num_pages release_num_pages beta_num_pages_max release_num_pages_max beta_daily_unique_domains release_daily_unique_domains beta_daily_max_tabs release_daily_max_tabs beta_daily_tabs_opened release_daily_tabs_opened beta_daily_num_sessions_started release_daily_num_sessions_started beta_daily_unique_domains_max release_daily_unique_domains_max beta_daily_max_tabs_max release_daily_max_tabs_max beta_daily_tabs_opened_max release_daily_tabs_opened_max beta_daily_num_sessions_started_max release_daily_num_sessions_started_max beta_startup_ms release_startup_ms beta_install_year release_install_year beta_profile_age release_profile_age beta_timezone_offset release_timezone_offset beta_memory_mb release_memory_mb beta_cpu_cores release_cpu_cores beta_cpu_speed_mhz release_cpu_speed_mhz beta_cpu_l2_cache_kb release_cpu_l2_cache_kb
Min. 1.000000 1.000000 0.0000000 0.0000000 0.000000 0.0000000 1.00000 1.00000 1.0000 1.0000 0.0166665 0.0192595 0.021389 0.030556 0.0000000 0.000000 0.000000 0.000000 0.0000 0.0000 1.000 0.000 1.00 0.00 1.000000 1.000000 1.000000 0.625000 1.00000 1.000000 0.000000 0.000000 1.00000 1.000000 1.00000 1.000000 1.00000 1.00000 0.000000 0.000000 269.000 261.128 1993.000 2000.000 0.0000 0.0000 -720.000 -720.000 511.000 512.000 1.000000 1.000000 798.000 792.00 128.0000 128.0000
1st Qu. 4.000000 4.000000 0.2250000 0.2686111 0.450000 0.5402778 37.00000 44.33333 68.0000 86.0000 2.5284028 2.1590431 4.877777 4.408542 0.0000000 0.000000 0.000000 0.000000 10.0000 10.0000 686.000 1022.125 785.00 1142.00 2.166667 2.283333 2.600000 2.500000 4.00000 4.000000 1.000000 1.250000 3.00000 3.125000 4.00000 4.000000 6.00000 7.00000 2.000000 2.000000 2102.206 1432.677 2017.000 2016.000 271.0000 257.0000 -300.000 -300.000 3984.000 4011.000 2.000000 2.000000 2200.000 2261.00 256.0000 256.0000
Median 6.000000 6.000000 0.5309524 0.5744444 1.063889 1.1541667 86.66667 96.66667 172.0000 196.0000 7.7105554 6.3351191 14.808889 11.701666 0.8333333 0.875000 2.000000 2.000000 26.0000 26.0000 4185.667 5536.000 4340.00 5705.50 3.562500 3.600000 4.250000 3.714286 9.00000 8.833333 1.666667 2.000000 5.50000 6.000000 6.00000 6.000000 17.00000 17.00000 3.000000 4.000000 5088.010 3231.339 2018.000 2018.000 711.0000 698.0000 -240.000 -240.000 8031.000 8069.000 2.000000 2.000000 2594.000 2712.00 256.0000 256.0000
Mean 5.346169 5.569842 0.8236611 0.8468557 1.577508 1.6251135 152.74550 156.24224 311.0213 321.3891 12.2961990 9.2821806 22.706568 18.210749 2.4506498 2.376504 5.636171 5.434352 242.4878 158.9390 17363.463 17330.600 17558.93 17518.75 5.060464 4.968328 9.603628 6.200080 20.49191 17.092979 2.368895 2.888602 8.74361 8.552061 13.81149 9.317556 39.64786 33.27059 4.281399 5.248573 25835.963 9832.051 2017.138 2017.064 893.7534 894.7365 -143.855 -238.714 8965.156 9443.657 2.975699 3.143089 2678.209 2710.62 679.9325 625.9611
3rd Qu. 8.000000 8.000000 1.1028646 1.1265956 2.165278 2.1902778 188.50000 197.00000 382.0000 400.0000 19.5792560 13.6608531 31.519026 26.128403 2.8333333 3.000000 7.000000 7.000000 96.0000 85.2125 18605.464 19680.625 18885.50 19922.00 6.166667 6.070833 8.000000 6.000000 21.75000 19.166667 2.875000 3.500000 11.00000 11.000000 12.00000 9.000000 42.00000 38.00000 5.000000 6.000000 12618.764 8394.891 2018.000 2018.000 1354.0000 1374.0000 60.000 -240.000 10238.000 12144.000 4.000000 4.000000 3192.000 3193.00 512.0000 512.0000
Max. 8.000000 8.000000 7.2901042 7.1222222 24.983333 23.9666667 2931.00000 2391.25000 15626.0000 18032.0000 240.8048605 91.0663890 1255.382223 384.288333 51.0000000 45.750000 188.000000 217.000000 40401.0000 18632.0000 179657.500 168416.286 180456.00 172543.00 44.000000 39.375000 1012.625000 445.375000 518.25000 347.500000 32.833333 32.250000 100.00000 100.000000 3149.00000 2425.000000 3302.00000 2410.00000 88.000000 100.000000 17109505.514 5358122.833 2019.000 2019.000 7051.0000 6922.0000 840.000 720.000 262078.000 524254.000 36.000000 40.000000 37214.000 15077.00 6144.0000 6144.0000

Validation

## Show text_tbl_v as a styled table inside a full-width scroll box
kable(text_tbl_v) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  scroll_box(width = "100%")
beta_num_active_days release_num_active_days beta_active_hours release_active_hours beta_active_hours_max release_active_hours_max beta_uri_count release_uri_count beta_uri_count_max release_uri_count_max beta_session_length release_session_length beta_session_length_max release_session_length_max beta_search_count release_search_count beta_search_count_max release_search_count_max beta_num_bookmarks release_num_bookmarks beta_num_pages release_num_pages beta_num_pages_max release_num_pages_max beta_daily_unique_domains release_daily_unique_domains beta_daily_max_tabs release_daily_max_tabs beta_daily_tabs_opened release_daily_tabs_opened beta_daily_num_sessions_started release_daily_num_sessions_started beta_daily_unique_domains_max release_daily_unique_domains_max beta_daily_max_tabs_max release_daily_max_tabs_max beta_daily_tabs_opened_max release_daily_tabs_opened_max beta_daily_num_sessions_started_max release_daily_num_sessions_started_max beta_startup_ms release_startup_ms beta_install_year release_install_year beta_profile_age release_profile_age beta_timezone_offset release_timezone_offset beta_memory_mb release_memory_mb beta_cpu_cores release_cpu_cores beta_cpu_speed_mhz release_cpu_speed_mhz beta_cpu_l2_cache_kb release_cpu_l2_cache_kb
Min. 1.000000 1.000000 0.0000000 0.0000000 0.0000000 0.000000 1.0000 1.00000 1.0000 1.0000 0.0199998 0.0157222 0.041111 0.019722 0.000000 0.000000 0.000000 0.000000 0.0000 0.00000 0.0000 0.0000 0.00 0.00 1.000000 1.000000 0.400000 0.5714286 1.00000 1.000000 0.000000 0.000000 1.00000 1.000000 1.00000 1.000000 1.00000 1.00000 0.000000 0.000000 289.000 2.388333e+02 2000.000 2000.000 0.0000 0.000 -720.0000 -720.0000 511.000 512.000 1.000000 1.000000 633.00 768.000 128.0000 128.0000
1st Qu. 3.000000 4.000000 0.2074074 0.2640873 0.3916667 0.537500 33.2500 44.00000 57.0000 86.0000 2.2133335 2.3199533 4.002778 4.788333 0.000000 0.000000 0.000000 0.000000 9.0000 10.00000 543.3333 991.3333 620.00 1113.00 2.125000 2.287037 2.500000 2.5000000 3.50000 4.000000 1.000000 1.166667 3.00000 3.200000 4.00000 4.000000 5.00000 7.00000 1.000000 2.000000 2218.863 1.567167e+03 2017.000 2017.000 213.0000 235.000 -300.0000 -300.0000 3981.000 4021.000 2.000000 2.000000 2195.00 2261.000 256.0000 256.0000
Median 5.000000 6.000000 0.5027778 0.5751736 0.9625000 1.165278 80.5000 97.42857 152.0000 199.0000 7.1543749 6.7808331 12.961389 12.802778 0.750000 1.000000 2.000000 3.000000 23.0000 25.33333 3347.5000 5308.0000 3513.00 5490.00 3.500000 3.651190 4.125000 3.8000000 8.50000 8.857143 1.666667 2.000000 5.20000 6.000000 6.00000 6.000000 15.00000 17.00000 3.000000 4.000000 5015.522 3.345629e+03 2018.000 2018.000 690.0000 673.000 -240.0000 -240.0000 7973.000 8073.000 2.000000 3.000000 2594.00 2712.000 256.0000 256.0000
Mean 4.912574 5.710307 0.7988445 0.8524956 1.4711888 1.636961 146.3339 158.71931 287.3003 328.3892 12.3367205 9.7067904 22.272966 18.614122 2.324319 2.446479 5.100206 5.633558 225.4153 158.03362 15614.0379 17089.9304 15779.79 17289.08 5.148112 5.112258 9.019717 6.3471824 20.03166 17.187064 2.398131 2.831202 8.58199 8.837358 12.82845 9.539785 37.29553 33.54423 4.141417 5.180103 50072.589 2.727928e+04 2017.255 2017.194 875.2575 883.857 -129.9082 -240.4712 8795.994 9719.802 2.954155 3.191904 2656.31 2712.603 674.5779 610.7777
3rd Qu. 7.000000 8.000000 1.0600694 1.1399306 2.0013889 2.220833 179.0000 200.14286 352.0000 409.0000 18.7390970 14.7740975 28.951388 27.097500 2.666667 3.000000 6.000000 7.000000 85.0000 84.00000 15660.6250 19359.8333 15872.00 19606.00 6.166667 6.250000 7.750000 6.2857143 21.00000 19.500000 3.000000 3.375000 10.50000 11.000000 11.00000 10.000000 39.00000 38.00000 5.000000 6.000000 11210.000 7.770437e+03 2019.000 2019.000 1329.0000 1368.000 60.0000 -240.0000 8189.000 12180.000 4.000000 4.000000 3192.00 3193.000 512.0000 512.0000
Max. 8.000000 8.000000 7.5402778 7.2204861 31.1277778 25.440278 2983.0000 2483.16667 17548.0000 18524.0000 286.6983330 90.4422220 922.285000 524.545556 50.000000 45.000000 313.000000 208.000000 39519.0000 20002.14286 177583.0000 168812.1429 182555.00 170532.00 49.291667 42.400000 910.400000 449.3333333 554.00000 357.000000 32.000000 32.250000 100.00000 100.000000 1779.00000 2215.000000 2551.00000 2342.00000 110.000000 184.000000 50660199.500 2.259481e+07 2019.000 2019.000 7095.0000 6972.000 840.0000 780.0000 294902.000 1572801.000 32.000000 50.000000 37221.00 28900.000 6144.0000 6144.0000

Comparing Two Continuous Distributions

Training

The QQ plot can be used to compare two continuous distributions.

par(mfrow = c(2, 2))  ## Lay the QQ plots out on a 2 x 2 grid

## QQ plot of beta vs. release for every metric in user_eng, annotated with
## the two-sample Kolmogorov-Smirnov statistic and p-value
for (i in user_eng) {
  x <- df_beta_ue[, i]
  y <- df_release_ue[, i]

  ## Shared axis range so both axes are comparable against the y = x line
  rg <- range(x, y, na.rm = TRUE)

  ## Run the test once and reuse both the statistic and the p-value
  ks <- ks.test(x, y)
  test <- paste("KS Test = ", round(ks$statistic, 3))
  ## format.pval() prints very small p-values as "<0.001" instead of
  ## rounding them to a misleading 0
  pvalue <- paste("P-value = ", format.pval(ks$p.value, digits = 3, eps = 0.001))

  qqplot(x, y, main = i, xlim = rg, ylim = rg, xlab = "Beta", ylab = "Release", pch = 1)
  # mtext(test, side=3)
  ## Anchor the annotation at the top-left corner of the plotted range;
  ## rg already ignores NAs and covers both samples
  text(rg[1], rg[2], paste(pvalue, "\n", test), adj = c(0, 1))
  abline(0, 1, col = "red")
}

## Print the full two-sample Kolmogorov-Smirnov test result (beta vs. release)
## for every metric named in user_eng, preceded by the metric name.
## NOTE: the "data:  x and y" line in the printed output is taken from the
## variable names used in the ks.test() call.
for (i in user_eng) {
  x <- df_beta_ue[,i]
  y <- df_release_ue[,i]
  
  print(i)
  ## ad.test() calls are left disabled
  # print(ad.test(x))
  # print(ad.test(y))
  print(ks.test(x, y))
}
## [1] "num_active_days"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.070619, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "active_hours"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.04493, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "active_hours_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.044246, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "uri_count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.047413, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "uri_count_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.053957, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "session_length"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.098562, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "session_length_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.073339, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "search_count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.014407, p-value = 4.652e-09
## alternative hypothesis: two-sided
## 
## [1] "search_count_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.019136, p-value = 1.221e-15
## alternative hypothesis: two-sided
## 
## [1] "num_bookmarks"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.026078, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_pages"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.05463, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_pages_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.053836, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_addons"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.60116, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_unique_domains"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.029154, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_unique_domains_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.03293, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_max_tabs"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.09362, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_max_tabs_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.07981, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_tabs_opened"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.034583, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_tabs_opened_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.032014, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_num_sessions_started"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.11891, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_num_sessions_started_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.11591, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "startup_ms"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.12852, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "install_year"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.018028, p-value = 6.062e-14
## alternative hypothesis: two-sided
## 
## [1] "profile_age"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.041815, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "timezone_offset"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.20833, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "memory_mb"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.074939, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_cores"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.064122, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_speed_mhz"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.0464, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_l2_cache_kb"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.025401, p-value < 2.2e-16
## alternative hypothesis: two-sided

Validation

The QQ plot can be used to compare two continuous distributions.

par(mfrow = c(2, 2))  ## Lay the QQ plots out on a 2 x 2 grid

## QQ plot of beta vs. release (validation data) for every metric in user_eng,
## annotated with the two-sample Kolmogorov-Smirnov statistic and p-value
for (i in user_eng) {
  x <- df_beta_v_ue[, i]
  y <- df_release_v_ue[, i]

  ## Shared axis range so both axes are comparable against the y = x line
  rg <- range(x, y, na.rm = TRUE)

  ## Run the test once and reuse both the statistic and the p-value
  ks <- ks.test(x, y)
  test <- paste("KS Test = ", round(ks$statistic, 3))
  ## format.pval() prints very small p-values as "<0.001" instead of
  ## rounding them to a misleading 0
  pvalue <- paste("P-value = ", format.pval(ks$p.value, digits = 3, eps = 0.001))

  qqplot(x, y, main = i, xlim = rg, ylim = rg, xlab = "Beta", ylab = "Release", pch = 1)
  # mtext(test, side=3)
  ## Anchor the annotation at the top-left corner of the plotted range;
  ## rg already ignores NAs and covers both samples
  text(rg[1], rg[2], paste(pvalue, "\n", test), adj = c(0, 1))
  abline(0, 1, col = "red")
}

## Print the full two-sample Kolmogorov-Smirnov test result (beta vs. release,
## validation data) for every metric named in user_eng, preceded by the
## metric name.
## NOTE: the "data:  x and y" line in the printed output is taken from the
## variable names used in the ks.test() call.
for (i in user_eng) {
  x <- df_beta_v_ue[,i]
  y <- df_release_v_ue[,i]
  
  print(i)
  ## ad.test() calls are left disabled
  # print(ad.test(x))
  # print(ad.test(y))
  print(ks.test(x, y))
}
## [1] "num_active_days"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.16585, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "active_hours"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.057657, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "active_hours_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.076585, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "uri_count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.07132, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "uri_count_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.094754, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "session_length"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.075533, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "session_length_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.039646, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "search_count"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.045374, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "search_count_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.058507, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_bookmarks"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.042296, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_pages"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.081639, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_pages_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.082435, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "num_addons"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.29246, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_unique_domains"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.039839, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_unique_domains_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.054168, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_max_tabs"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.074689, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_max_tabs_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.050656, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_tabs_opened"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.036161, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_tabs_opened_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.052805, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_num_sessions_started"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.097817, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "daily_num_sessions_started_max"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.11694, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "startup_ms"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.1303, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "install_year"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.016919, p-value = 3.642e-14
## alternative hypothesis: two-sided
## 
## [1] "profile_age"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.035079, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "timezone_offset"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.23513, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "memory_mb"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.10938, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_cores"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.087202, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_speed_mhz"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.061914, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## [1] "cpu_l2_cache_kb"
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.034682, p-value < 2.2e-16
## alternative hypothesis: two-sided

Observations

  • Analyzing only the plots, in general, the distributions are very similar to each other
  • This is mainly true for the metrics related to active hours and number of active days, search count, number of pages, daily unique domains and daily number of sessions started
  • Comparing only the training and validation plots, we can notice that the variables that presented a more different behavior were: num_active_days, uri_count_max and num_addons
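  • When I performed the KS (Kolmogorov-Smirnov) test, the null hypothesis that both samples come from the same distribution was rejected for all the comparisons. Am I doing something wrong? (Note: with samples of this size the KS test detects even very small differences, so the D statistic is a better guide to practical similarity than the p-value.)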

User Engagement Discrete Metrics

This section will focus only on user engagement discrete metrics. So, we are going to analyze the following metrics:

  • default_search_engine
  • is_default_browser
  • profile_age_cat
  • distro_id_norm
  • memory_cat
  • cpu_speed_cat
  • cpu_cores_cat
  • cpu_l2_cache_kb_cat
  • cpu_vendor
  • os_version
  • is_wow64
  • fxa_configured
  • sync_configured
  • locale
  • country
  • timezone_cat
  • label
  • normalized_channel
  • is_release

Comparing Two Discrete Distributions

Training

par(mfrow = c(2, 2))  ## 2 x 2 grid: each row holds Beta and Release for one metric

## Relative-frequency bar charts of every discrete metric, beta then release
for (i in user_eng_dis) {
  samples <- list(
    Beta = df_beta_ue_dis[, i],
    Release = df_release_ue_dis[, i]
  )
  n_rows <- c(Beta = nrow(df_beta_ue_dis), Release = nrow(df_release_ue_dis))

  for (channel in names(samples)) {
    ## Divide the frequency counts by the total number of rows
    rel_freq <- table(samples[[channel]]) / n_rows[[channel]]
    bar_mid <- barplot(
      rel_freq,
      main = channel,               # chart title
      ylim = c(0, 1), border = FALSE, col = "navy",
      xlab = i,                     # x axis label: the metric name
      ylab = "Relative Frequency"   # y axis label
    )
    ## Percentage label just above each bar
    text(bar_mid, rel_freq + 0.025, paste0(round(rel_freq * 100), "%"), cex = 1)
  }
}

Validation

par(mfrow = c(2, 2))  ## 2 x 2 grid: each row holds Beta and Release for one metric

## Relative-frequency bar charts of every discrete metric (validation data),
## beta then release
for (i in user_eng_dis) {
  samples <- list(
    Beta = df_beta_v_ue_dis[, i],
    Release = df_release_v_ue_dis[, i]
  )
  n_rows <- c(Beta = nrow(df_beta_v_ue_dis), Release = nrow(df_release_v_ue_dis))

  for (channel in names(samples)) {
    ## Divide the frequency counts by the total number of rows
    rel_freq <- table(samples[[channel]]) / n_rows[[channel]]
    bar_mid <- barplot(
      rel_freq,
      main = channel,               # chart title
      ylim = c(0, 1), border = FALSE, col = "navy",
      xlab = i,                     # x axis label: the metric name
      ylab = "Relative Frequency"   # y axis label
    )
    ## Percentage label just above each bar
    text(bar_mid, rel_freq + 0.025, paste0(round(rel_freq * 100), "%"), cex = 1)
  }
}

Observations

  • Analyzing only the plots, in general, the distributions are very similar to each other